In [1]:
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
from prettyprint import pp
import os, re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.grid_search import GridSearchCV
from datetime import datetime as dt
from ipy_table import *
%matplotlib inline

Files directory


In [2]:
root_path = 'E:/University Central/Modern Information Retrieval/Project/Project Phase 2/20_newsgroup/'
#top_view folders
folders = [root_path + folder + '/' for folder in os.listdir(root_path)]


#there are only 4 classes
class_titles = os.listdir(root_path)


#list of all the files belonging to each class
files = {}
for folder, title in zip(folders, class_titles):
    files[title] = [folder + f for f in os.listdir(folder)]
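A quick sanity check on what was collected (illustrative; the counts depend on the local copy of the corpus):

# count the documents gathered for each class
for title in class_titles:
    print title, len(files[title])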

Split documents into training and test sets


In [3]:
train_test_ratio = 0.75

def train_test_split(ratio, classes, files):
    """
    this method will split the input list of files to train and test sets.
    *Note: currently this method uses the simplest way an array can be split in two parts.
    Parameters
    ----------
    ratio: float
           ratio of total documents in each class assigned to the training set
    classes: list
             list of label classes
    files: dictionary
           a dictionary with list of files for each class
    
    Returns
    -------
    train_dic: dictionary
                a dictionary with lists of documents in the training set for each class
    test_dict: dictionary
                a dictionary with lists of documents in the testing set for each class
    """
    train_dict = {}
    test_dict = {}
    for cl in classes:
        train_cnt = int(ratio * len(files[cl]))
        train_dict[cl] = files[cl][:train_cnt]
        test_dict[cl] = files[cl][train_cnt:]
    return train_dict, test_dict
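The docstring's note flags that this split is not randomized: the first 75% of each class's directory listing becomes training data. A minimal sketch of a shuffled variant, assuming a fixed seed for reproducibility (not the function used for the results below):

import random

def shuffled_train_test_split(ratio, classes, files, seed=0):
    # same contract as train_test_split above, but shuffles each
    # class's file list (with a fixed seed) before cutting it in two
    rng = random.Random(seed)
    train_dict, test_dict = {}, {}
    for cl in classes:
        docs = list(files[cl])
        rng.shuffle(docs)
        train_cnt = int(ratio * len(docs))
        train_dict[cl] = docs[:train_cnt]
        test_dict[cl] = docs[train_cnt:]
    return train_dict, test_dict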

In [4]:
train_path, test_path = train_test_split(train_test_ratio, class_titles, files)

Clean up text


In [5]:
pattern = re.compile(r'([a-zA-Z]+|[0-9]+(\.[0-9]+)?)')

def cleanupText(path):
    """
    this method reads in a text file and cleans up its text.
    
    Parameters
    ----------
    path: str
          path to the document file
    Returns
    -------
    text_translated: str
                     cleaned-up version of the raw text in the input file
    """
    from string import punctuation, digits
    text_translated = ''
    # 'with' closes the file even on errors, and avoids the NameError the
    # old try/finally raised when open() itself failed
    with open(path) as f:
        raw = f.read().lower()
        # pad every word/number with spaces so tokens do not fuse later
        text = pattern.sub(r' \1 ', raw.replace('\n', ' '))
        # drop punctuation and digits (Python 2 str.translate signature)
        text_translated = text.translate(None, punctuation + digits)
        # keep only tokens longer than one character
        text_translated = ' '.join([word for word in text_translated.split(' ') if (word and len(word) > 1)])
    return text_translated
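As a quick illustration of what this pipeline does, here is the same sequence of steps applied to an in-memory string (a hypothetical sample, not one of the corpus files):

# illustrative only: replicate cleanupText's steps on a string
from string import punctuation, digits
raw = 'Hello, world!\nVersion 3.5 is out.'.lower()
text = pattern.sub(r' \1 ', raw.replace('\n', ' '))
stripped = text.translate(None, punctuation + digits)
print ' '.join([w for w in stripped.split(' ') if (w and len(w) > 1)])
# -> hello world version is out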

Create arrays of documents and their corresponding labels


In [6]:
train_arr = []
test_arr = []
train_lbl = []
test_lbl = []
for cl in class_titles:
    for path in train_path[cl]:
        train_arr.append(cleanupText(path))
        train_lbl.append(cl)
    for path in test_path[cl]:
        test_arr.append(cleanupText(path))
        test_lbl.append(cl)
        
print len(train_arr)
print len(test_arr)


600
200

Create text vectorizer


In [7]:
vectorizer = CountVectorizer()
vectorizer.fit(train_arr)
train_mat = vectorizer.transform(train_arr)
print train_mat.shape
test_mat = vectorizer.transform(test_arr)
print test_mat.shape


(600, 19420)
(200, 19420)
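To sanity-check what the vectorizer learned, one can peek at the vocabulary (a hedged aside; output omitted here):

# the vocabulary size should match the 19420 columns reported above
print len(vectorizer.vocabulary_)
print vectorizer.get_feature_names()[:10]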

Create Tfidf Transformer


In [8]:
tfidf = TfidfTransformer()
tfidf.fit(train_mat)
train_tfmat = tfidf.transform(train_mat)
print train_tfmat.shape
test_tfmat = tfidf.transform(test_mat)
print test_tfmat.shape


(600, 19420)
(200, 19420)
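For reference, with its default settings (smooth_idf=True, norm='l2') sklearn's TfidfTransformer computes, for term $t$ in document $d$ over $n$ training documents, approximately

$$\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \left(\ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1\right)$$

and then normalizes each document row to unit $\ell_2$ length (the exact smoothing may vary slightly across library versions).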

Test Classifier


In [9]:
def testClassifier(x_train, y_train, x_test, y_test, clf):
    """
    this method first trains the classifier on the training data,
    then evaluates the trained classifier on the test data,
    and finally reports several metrics of classifier performance.
    
    Parameters
    ----------
    x_train: np.ndarray
             train data matrix
    y_train: list
             train data label
    x_test: np.ndarray
            test data matrix
    y_test: list
            test data label
    clf: sklearn classifier object implementing fit() and predict() methods
    
    Returns
    -------
    metrics: list
             [training time, testing time, precision and recall for each class, macro-averaged F1 score]
    """
    metrics = []
    start = dt.now()
    clf.fit(x_train, y_train)
    end = dt.now()
    print 'training time: ', (end - start)
    
    # add training time to metrics
    metrics.append(end-start)
    
    start = dt.now()
    yhat = clf.predict(x_test)
    end = dt.now()
    print 'testing time: ', (end - start)
    
    # add testing time to metrics
    metrics.append(end-start)
    
    print 'classification report: '
#     print classification_report(y_test, yhat)
    pp(classification_report(y_test, yhat))
    
    print 'f1 score'
    print f1_score(y_test, yhat, average='macro')
    
    print 'accuracy score'
    print accuracy_score(y_test, yhat)
    
    precision = precision_score(y_test, yhat, average=None)
    recall = recall_score(y_test, yhat, average=None)
    
    # add precision and recall values to metrics
    for p, r in zip(precision, recall):
        metrics.append(p)
        metrics.append(r)
    
    
    #add macro-averaged F1 score to metrics
    metrics.append(f1_score(y_test, yhat, average='macro'))
    
    print 'confusion matrix:'
    print confusion_matrix(y_test, yhat)
    
    # plotting the confusion matrix
    plt.imshow(confusion_matrix(y_test, yhat), interpolation='nearest')
    plt.show()
    
    return metrics

Metrics list


In [10]:
metrics_dict = []
# each entry is a dict: {'name': <classifier name>, 'metrics': <list returned by testClassifier>}

First, classifiers with default settings will be tested

Naive Bayes classifiers

Bernoulli Naive Bayes classifier


In [11]:
bnb = BernoulliNB()
bnb_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, bnb)
metrics_dict.append({'name':'BernoulliNB', 'metrics':bnb_me})


training time:  0:00:00.007000
testing time:  0:00:00.004000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       1.00      0.38      0.55        50
rec.sport.baseball       1.00      1.00      1.00        50
sci.electronics       0.60      0.98      0.74        50
soc.religion.christian       0.98      0.96      0.97        50

avg / total       0.89      0.83      0.82       200
"
f1 score
0.815711462451
accuracy score
0.83
confusion matrix:
[[19  0 31  0]
 [ 0 50  0  0]
 [ 0  0 49  1]
 [ 0  0  2 48]]

Gaussian Naive Bayes classifier


In [12]:
gnb = GaussianNB()
gnb_me = testClassifier(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, gnb)
metrics_dict.append({'name':'GaussianNB', 'metrics':gnb_me})


training time:  0:00:00.404000
testing time:  0:00:00.354000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       0.83      0.86      0.84        50
rec.sport.baseball       0.94      0.96      0.95        50
sci.electronics       0.89      0.80      0.84        50
soc.religion.christian       0.96      1.00      0.98        50

avg / total       0.90      0.91      0.90       200
"
f1 score
0.904032431107
accuracy score
0.905
confusion matrix:
[[43  2  4  1]
 [ 1 48  1  0]
 [ 8  1 40  1]
 [ 0  0  0 50]]

Multinomial Naive Bayes classifier


In [13]:
mnb = MultinomialNB()
mnb_me = testClassifier(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, mnb)
metrics_dict.append({'name':'MultinomialNB', 'metrics':mnb_me})


training time:  0:00:00.178000
testing time:  0:00:00.015000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       0.98      0.98      0.98        50
rec.sport.baseball       0.98      0.98      0.98        50
sci.electronics       0.98      0.92      0.95        50
soc.religion.christian       0.94      1.00      0.97        50

avg / total       0.97      0.97      0.97       200
"
f1 score
0.969831848664
accuracy score
0.97
confusion matrix:
[[49  0  0  1]
 [ 0 49  1  0]
 [ 1  1 46  2]
 [ 0  0  0 50]]

K Nearest Neighbors classifier


In [14]:
# for nn in [5, 10, 15]:
for nn in [5]:
    print 'knn with ', nn, ' neighbors'
    knn = KNeighborsClassifier(n_neighbors=nn)
    knn_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, knn)
    metrics_dict.append({'name':'5NN', 'metrics':knn_me})
    print ' '


knn with  5  neighbors
training time:  0:00:00.001000
testing time:  0:00:00.039000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       0.74      0.90      0.81        50
rec.sport.baseball       0.80      0.74      0.77        50
sci.electronics       0.84      0.62      0.71        50
soc.religion.christian       0.89      1.00      0.94        50

avg / total       0.82      0.81      0.81       200
"
f1 score
0.80942101218
accuracy score
0.815
confusion matrix:
[[45  0  3  2]
 [10 37  3  0]
 [ 6  9 31  4]
 [ 0  0  0 50]]
 

SVM classifier

Linear SVM


In [15]:
lsvm = LinearSVC()
lsvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, lsvm)
metrics_dict.append({'name':'LinearSVM', 'metrics':lsvm_me})


training time:  0:00:00.038000
testing time:  0:00:00
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       1.00      1.00      1.00        50
rec.sport.baseball       1.00      0.98      0.99        50
sci.electronics       0.98      1.00      0.99        50
soc.religion.christian       1.00      1.00      1.00        50

avg / total       1.00      0.99      0.99       200
"
f1 score
0.99499949995
accuracy score
0.995
confusion matrix:
[[50  0  0  0]
 [ 0 49  1  0]
 [ 0  0 50  0]
 [ 0  0  0 50]]

$\nu$-SVM


In [16]:
nusvm = NuSVC()
nusvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, nusvm)
metrics_dict.append({'name':'nuSVM', 'metrics':nusvm_me})


training time:  0:00:00.383000
testing time:  0:00:00.079000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       1.00      0.84      0.91        50
rec.sport.baseball       0.92      0.98      0.95        50
sci.electronics       0.85      0.92      0.88        50
soc.religion.christian       0.98      1.00      0.99        50

avg / total       0.94      0.94      0.93       200
"
f1 score
0.934803545864
accuracy score
0.935
confusion matrix:
[[42  1  7  0]
 [ 0 49  1  0]
 [ 0  3 46  1]
 [ 0  0  0 50]]

SVM with RBF kernel


In [17]:
rbfsvm = SVC()
rbfsvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, rbfsvm)
metrics_dict.append({'name':'SVM with RBF kernel', 'metrics':rbfsvm_me})


training time:  0:00:00.743000
testing time:  0:00:00.153000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       1.00      0.52      0.68        50
rec.sport.baseball       1.00      0.92      0.96        50
sci.electronics       0.64      1.00      0.78        50
soc.religion.christian       1.00      1.00      1.00        50

avg / total       0.91      0.86      0.86       200
"
f1 score
0.855948464912
accuracy score
0.86
confusion matrix:
[[26  0 24  0]
 [ 0 46  4  0]
 [ 0  0 50  0]
 [ 0  0  0 50]]

Second, the best classifier of each kind will be found using cross validation (5- or 10-fold, depending on the cell) with a search over a parameter grid

Naive Bayes classifiers

Best Bernoulli Naive Bayes classifier

Parameters

alpha : float, optional (default=1.0)

Additive (Laplace/Lidstone) smoothing parameter
(0 for no smoothing).

binarize : float or None, optional

Threshold for binarizing (mapping to booleans) sample features.
If None, input is presumed to already consist of binary vectors.

fit_prior : boolean

Whether to learn class prior probabilities or not.
If false, a uniform prior will be used.

class_prior : array-like, size=[n_classes,]

Prior probabilities of the classes. If specified the priors are not
adjusted according to the data.

Note: since classes are balanced, their priors are equal.
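The grid in the next cell sweeps only alpha; since the class priors are equal, class_prior is left at its default. A broader, purely illustrative grid could also cover binarize and fit_prior:

# a hypothetical wider grid (not the one used below)
bnb_params_full = {'alpha': [a*0.1 for a in range(0,11)],
                   'binarize': [0.0, 0.01, 0.1],
                   'fit_prior': [True, False]}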


In [21]:
bnb_params = {'alpha': [a*0.1 for a in range(0,11)]}
bnb_clf = GridSearchCV(BernoulliNB(), bnb_params, cv=10)
bnb_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print bnb_clf.best_params_
best_bnb = BernoulliNB(alpha=bnb_clf.best_params_['alpha'])
best_bnb_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_bnb)
metrics_dict.append({'name':'Best BernoulliNB', 'metrics':best_bnb_me})


best parameters
{'alpha': 0.1}
training time:  0:00:00.004000
testing time:  0:00:00.003000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       1.00      0.74      0.85        50
rec.sport.baseball       1.00      1.00      1.00        50
sci.electronics       0.79      1.00      0.88        50
soc.religion.christian       1.00      1.00      1.00        50

avg / total       0.95      0.94      0.93       200
"
f1 score
0.933882616214
accuracy score
0.935
confusion matrix:
[[37  0 13  0]
 [ 0 50  0  0]
 [ 0  0 50  0]
 [ 0  0  0 50]]

Best Gaussian Naive Bayes classifier

Note: this classifier doesn't have any tunable parameters, so no cross validation or grid search over parameters can be performed; the default model is evaluated again.


In [22]:
best_gnb = GaussianNB()
best_gnb_me = testClassifier(train_tfmat.toarray(), train_lbl, test_tfmat.toarray(), test_lbl, best_gnb)
metrics_dict.append({'name':'Best GaussianNB', 'metrics':best_gnb_me})


training time:  0:00:00.393000
testing time:  0:00:00.271000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       0.83      0.86      0.84        50
rec.sport.baseball       0.94      0.96      0.95        50
sci.electronics       0.89      0.80      0.84        50
soc.religion.christian       0.96      1.00      0.98        50

avg / total       0.90      0.91      0.90       200
"
f1 score
0.904032431107
accuracy score
0.905
confusion matrix:
[[43  2  4  1]
 [ 1 48  1  0]
 [ 8  1 40  1]
 [ 0  0  0 50]]

Best Multinomial Naive Bayes classifier

Parameters

alpha : float, optional (default=1.0)

Additive (Laplace/Lidstone) smoothing parameter
(0 for no smoothing).

fit_prior : boolean

Whether to learn class prior probabilities or not.
If false, a uniform prior will be used.

class_prior : array-like, size (n_classes,)

Prior probabilities of the classes. If specified the priors are not
adjusted according to the data.

In [25]:
mbn_params = {'alpha': [a*0.1 for a in range(0,11)]}
mbn_clf = GridSearchCV(MultinomialNB(), mbn_params, cv=10)
mbn_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print mbn_clf.best_params_
best_mbn = MultinomialNB(alpha=mbn_clf.best_params_['alpha'])
best_mbn_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_mbn)
metrics_dict.append({'name':'Best MultinomialNB', 'metrics':best_mbn_me})


best parameters
{'alpha': 0.2}
training time:  0:00:00.003000
testing time:  0:00:00.001000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       0.98      0.98      0.98        50
rec.sport.baseball       0.98      1.00      0.99        50
sci.electronics       0.98      0.96      0.97        50
soc.religion.christian       1.00      1.00      1.00        50

avg / total       0.98      0.98      0.98       200
"
f1 score
0.984948994899
accuracy score
0.985
confusion matrix:
[[49  0  1  0]
 [ 0 50  0  0]
 [ 1  1 48  0]
 [ 0  0  0 50]]

Best KNN classifier

Parameters

n_neighbors : int, optional (default = 5)

Number of neighbors to use by default for :meth:`kneighbors` queries.

weights : str or callable

weight function used in prediction.  Possible values:

- 'uniform' : uniform weights.  All points in each neighborhood
  are weighted equally.
- 'distance' : weight points by the inverse of their distance.
  In this case, closer neighbors of a query point have a
  greater influence than neighbors which are further away
  (see the micro-example after this parameter list).
- [callable] : a user-defined function which accepts an
  array of distances, and returns an array of the same shape
  containing the weights.

Uniform weights are used by default.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional

Algorithm used to compute the nearest neighbors:

- 'ball_tree' will use :class:`BallTree`
- 'kd_tree' will use :class:`KDTree`
- 'brute' will use a brute-force search.
- 'auto' will attempt to decide the most appropriate algorithm
  based on the values passed to :meth:`fit` method.

Note: fitting on sparse input will override the setting of
this parameter, using brute force.

leaf_size : int, optional (default = 30)

Leaf size passed to BallTree or KDTree.  This can affect the
speed of the construction and query, as well as the memory
required to store the tree.  The optimal value depends on the
nature of the problem.

metric : string or DistanceMetric object (default='minkowski')

the distance metric to use for the tree.  The default metric is
minkowski, and with p=2 is equivalent to the standard Euclidean
metric. See the documentation of the DistanceMetric class for a
list of available metrics.

p : integer, optional (default = 2)

Power parameter for the Minkowski metric. When p = 1, this is
equivalent to using manhattan_distance (l1), and euclidean_distance
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
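As a worked micro-example of the 'distance' weighting option mentioned above: three neighbors at distances 1, 2 and 4 vote with weights proportional to the inverses of those distances.

# illustrative: inverse-distance voting weights for neighbors at d = 1, 2, 4
dists = [1.0, 2.0, 4.0]
print [1.0 / d for d in dists]   # -> [1.0, 0.5, 0.25]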

In [28]:
knn_params = {'n_neighbors': range(1,21), 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree'],
              'leaf_size': [15, 30, 50, 100], 'p': [1,2]}
knn_clf = GridSearchCV(KNeighborsClassifier(), knn_params, cv=10)
knn_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print knn_clf.best_params_
best_knn = KNeighborsClassifier(n_neighbors=knn_clf.best_params_['n_neighbors'], weights=knn_clf.best_params_['weights'],
                                algorithm=knn_clf.best_params_['algorithm'], leaf_size=knn_clf.best_params_['leaf_size'])
best_knn_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_knn)
metrics_dict.append({'name':'Best KNN', 'metrics':best_knn_me})


best parameters
{'n_neighbors': 4, 'weights': 'distance', 'leaf_size': 15, 'algorithm': 'ball_tree', 'p': 1}
training time:  0:00:00
testing time:  0:00:00.031000
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       0.71      0.84      0.77        50
rec.sport.baseball       0.80      0.70      0.74        50
sci.electronics       0.80      0.66      0.73        50
soc.religion.christian       0.89      1.00      0.94        50

avg / total       0.80      0.80      0.80       200
"
f1 score
0.795998501147
accuracy score
0.8
confusion matrix:
[[42  2  4  2]
 [11 35  4  0]
 [ 6  7 33  4]
 [ 0  0  0 50]]

Best Linear SVM classifier

Parameters

C : float, optional (default=1.0)

Penalty parameter C of the error term.

loss : string, 'l1' or 'l2' (default='l2')

Specifies the loss function. 'l1' is the hinge loss (standard SVM)
while 'l2' is the squared hinge loss (see the formulas after this parameter list).

penalty : string, 'l1' or 'l2' (default='l2')

Specifies the norm used in the penalization. The 'l2'
penalty is the standard used in SVC. The 'l1' leads to `coef_`
vectors that are sparse.

dual : bool, (default=True)

Select the algorithm to either solve the dual or primal
optimization problem. Prefer dual=False when n_samples > n_features.

tol : float, optional (default=1e-4)

Tolerance for stopping criteria

multi_class: string, 'ovr' or 'crammer_singer' (default='ovr')

Determines the multi-class strategy if `y` contains more than
two classes.
`ovr` trains n_classes one-vs-rest classifiers, while `crammer_singer`
optimizes a joint objective over all classes.
While `crammer_singer` is interesting from a theoretical perspective
as it is consistent, it is seldom used in practice, rarely leads to
better accuracy, and is more expensive to compute.
If `crammer_singer` is chosen, the options loss, penalty and dual will
be ignored.

fit_intercept : boolean, optional (default=True)

Whether to calculate the intercept for this model. If set
to false, no intercept will be used in calculations
(e.g. data is expected to be already centered).

intercept_scaling : float, optional (default=1)

when self.fit_intercept is True, instance vector x becomes
[x, self.intercept_scaling],
i.e. a "synthetic" feature with constant value equal to
intercept_scaling is appended to the instance vector.
The intercept becomes intercept_scaling * synthetic feature weight.
Note: the synthetic feature weight is subject to l1/l2 regularization
like all other features.
To lessen the effect of regularization on the synthetic feature weight
(and therefore on the intercept), intercept_scaling has to be increased.

class_weight : {dict, 'auto'}, optional

Set the parameter C of class i to class_weight[i]*C for
SVC. If not given, all classes are supposed to have
weight one. The 'auto' mode uses the values of y to
automatically adjust weights inversely proportional to
class frequencies.

verbose : int, default: 0

Enable verbose output. Note that this setting takes advantage of a
per-process runtime setting in liblinear that, if enabled, may not work
properly in a multithreaded context.

random_state : int seed, RandomState instance, or None (default)

The seed of the pseudo random number generator to use when
shuffling the data.
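For reference, the two loss options above (given here in their standard textbook forms) are, for a label $y \in \{-1, +1\}$ and decision value $f(x)$:

$$L_{l1}(y, f(x)) = \max(0,\, 1 - y f(x)), \qquad L_{l2}(y, f(x)) = \max(0,\, 1 - y f(x))^2$$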

In [29]:
lsvm_params = {'C':[1,10,100,1000], 'loss':['l1', 'l2']}
lsvm_clf = GridSearchCV(LinearSVC(), lsvm_params, cv=5)
lsvm_clf.fit(train_tfmat, train_lbl)
print 'best parameters'
print lsvm_clf.best_params_
best_lsvm = LinearSVC(C=lsvm_clf.best_params_['C'], loss=lsvm_clf.best_params_['loss'])
best_lsvm_me = testClassifier(train_tfmat, train_lbl, test_tfmat, test_lbl, best_lsvm)
metrics_dict.append({'name':'Best Linear SVM', 'metrics':best_lsvm_me})


best parameters
{'loss': 'l1', 'C': 10}
training time:  0:00:00.096000
testing time:  0:00:00
classification report: 
"             precision    recall  f1-score   support

comp.os.ms-windows.misc       1.00      1.00      1.00        50
rec.sport.baseball       1.00      0.98      0.99        50
sci.electronics       0.98      1.00      0.99        50
soc.religion.christian       1.00      1.00      1.00        50

avg / total       1.00      0.99      0.99       200
"
f1 score
0.99499949995
accuracy score
0.995
confusion matrix:
[[50  0  0  0]
 [ 0 49  1  0]
 [ 0  0 50  0]
 [ 0  0  0 50]]

Create a table summarizing the classifiers' metrics


In [31]:
metrics_table = []
metrics_table.append(['', 'name', 'training time', 'testing time',
                      'p_1', 'r_1',
                      'p_2', 'r_2',
                      'p_3', 'r_3',
                      'p_4', 'r_4',
                      'macro-averaged F1 score'
                      ])
i = 0
for me in metrics_dict:
    i += 1
    metric = []
    metric.append(i)
    metric.append(me['name'])
    for m in me['metrics']:
        metric.append(m)
    metrics_table.append(metric)
make_table(metrics_table)

# styling
apply_theme('basic_both')
set_column_style(12, align='center')


Out[31]:
(classes 1-4 below: comp.os.ms-windows.misc, rec.sport.baseball, sci.electronics, soc.religion.christian)

|  # | name                | training time  | testing time   | p_1    | r_1    | p_2    | r_2    | p_3    | r_3    | p_4    | r_4    | macro-averaged F1 score |
|----|---------------------|----------------|----------------|--------|--------|--------|--------|--------|--------|--------|--------|-------------------------|
|  1 | BernoulliNB         | 0:00:00.007000 | 0:00:00.004000 | 1.0000 | 0.3800 | 1.0000 | 1.0000 | 0.5976 | 0.9800 | 0.9796 | 0.9600 | 0.8157                  |
|  2 | GaussianNB          | 0:00:00.404000 | 0:00:00.354000 | 0.8269 | 0.8600 | 0.9412 | 0.9600 | 0.8889 | 0.8000 | 0.9615 | 1.0000 | 0.9040                  |
|  3 | MultinomialNB       | 0:00:00.178000 | 0:00:00.015000 | 0.9800 | 0.9800 | 0.9800 | 0.9800 | 0.9787 | 0.9200 | 0.9434 | 1.0000 | 0.9698                  |
|  4 | 5NN                 | 0:00:00.001000 | 0:00:00.039000 | 0.7377 | 0.9000 | 0.8043 | 0.7400 | 0.8378 | 0.6200 | 0.8929 | 1.0000 | 0.8094                  |
|  5 | LinearSVM           | 0:00:00.038000 | 0:00:00        | 1.0000 | 1.0000 | 1.0000 | 0.9800 | 0.9804 | 1.0000 | 1.0000 | 1.0000 | 0.9950                  |
|  6 | nuSVM               | 0:00:00.383000 | 0:00:00.079000 | 1.0000 | 0.8400 | 0.9245 | 0.9800 | 0.8519 | 0.9200 | 0.9804 | 1.0000 | 0.9348                  |
|  7 | SVM with RBF kernel | 0:00:00.743000 | 0:00:00.153000 | 1.0000 | 0.5200 | 1.0000 | 0.9200 | 0.6410 | 1.0000 | 1.0000 | 1.0000 | 0.8559                  |
|  8 | Best BernoulliNB    | 0:00:00.004000 | 0:00:00.003000 | 1.0000 | 0.7400 | 1.0000 | 1.0000 | 0.7937 | 1.0000 | 1.0000 | 1.0000 | 0.9339                  |
|  9 | Best GaussianNB     | 0:00:00.393000 | 0:00:00.271000 | 0.8269 | 0.8600 | 0.9412 | 0.9600 | 0.8889 | 0.8000 | 0.9615 | 1.0000 | 0.9040                  |
| 10 | Best MultinomialNB  | 0:00:00.003000 | 0:00:00.001000 | 0.9800 | 0.9800 | 0.9804 | 1.0000 | 0.9796 | 0.9600 | 1.0000 | 1.0000 | 0.9849                  |
| 11 | Best KNN            | 0:00:00        | 0:00:00.031000 | 0.7119 | 0.8400 | 0.7955 | 0.7000 | 0.8049 | 0.6600 | 0.8929 | 1.0000 | 0.7960                  |
| 12 | Best Linear SVM     | 0:00:00.096000 | 0:00:00        | 1.0000 | 1.0000 | 1.0000 | 0.9800 | 0.9804 | 1.0000 | 1.0000 | 1.0000 | 0.9950                  |